Date: Mon Oct 28 03:12:48 2019
Scientist: Ran Yin
Sequencing (Waksman): Dibyendu Kumar (?)
Statistics: Davit Sargsyan
Principal Investigator: Ah-Ng Kong
This script was developed using DADA2 Pipeline Tutorial (1.12) with tips and tricks from the University of Maryland School of Medicine Institute for Genome Sciences (IGS) Microbiome Analysis Workshop (April 8-11, 2019).
FastQ files were downloaded from this Rutgers Box location. A total of 72 files (2 per sample, pair-ended) and a pair of undetermined reads were downloaded.
16s metadata Sep-2019.xlsx meta-data file was created by Ran. It was saved as .CSV file and used by this script.
# Read the study meta-data (created by Ran from "16s metadata Sep-2019.xlsx"
# and saved as CSV). NOTE(review): fread() errors if the CSV is missing from
# "data_sep2019/" -- confirm the path when rerunning on another machine.
dt.meta <- fread("data_sep2019/16s metadata Sep-2019.csv")
# Cache the meta-data as .RData so later chunks/sessions can load() it.
save(dt.meta,
file = "data_sep2019/dt.meta.RData")
# Render the meta-data as an interactive HTML table (DT package).
datatable(dt.meta,
caption = "Table 1: Meta-Data",
rownames = FALSE,
class = "cell-border stripe",
options = list(searching = TRUE,
pageLength = 8))
NOTE: moved “Undetermined” samples from the FastQ folder to the docs folder.
# Get FastQ file names----
path <- "fastq_sep2019"
# List the gzipped FastQ files. `pattern` is a regular expression, so the dot
# is escaped and the match anchored to the end of the name ("\\.gz$"); the
# original ".gz" would also match any name containing <any char> + "gz"
# anywhere (e.g. a stray "notes_gz.txt").
list.files(path = path,
           pattern = "\\.gz$")
[1] "190919-01_S1_L001_R1_001.fastq.gz" "190919-01_S1_L001_R2_001.fastq.gz"
[3] "190919-02_S2_L001_R1_001.fastq.gz" "190919-02_S2_L001_R2_001.fastq.gz"
[5] "190919-03_S3_L001_R1_001.fastq.gz" "190919-03_S3_L001_R2_001.fastq.gz"
[7] "190919-04_S4_L001_R1_001.fastq.gz" "190919-04_S4_L001_R2_001.fastq.gz"
[9] "190919-05_S5_L001_R1_001.fastq.gz" "190919-05_S5_L001_R2_001.fastq.gz"
[11] "190919-06_S6_L001_R1_001.fastq.gz" "190919-06_S6_L001_R2_001.fastq.gz"
[13] "190919-07_S7_L001_R1_001.fastq.gz" "190919-07_S7_L001_R2_001.fastq.gz"
[15] "190919-08_S8_L001_R1_001.fastq.gz" "190919-08_S8_L001_R2_001.fastq.gz"
[17] "190919-09_S9_L001_R1_001.fastq.gz" "190919-09_S9_L001_R2_001.fastq.gz"
[19] "190919-10_S10_L001_R1_001.fastq.gz" "190919-10_S10_L001_R2_001.fastq.gz"
[21] "190919-11_S11_L001_R1_001.fastq.gz" "190919-11_S11_L001_R2_001.fastq.gz"
[23] "190919-12_S12_L001_R1_001.fastq.gz" "190919-12_S12_L001_R2_001.fastq.gz"
[25] "190919-13_S13_L001_R1_001.fastq.gz" "190919-13_S13_L001_R2_001.fastq.gz"
[27] "190919-15_S15_L001_R1_001.fastq.gz" "190919-15_S15_L001_R2_001.fastq.gz"
[29] "190919-16_S16_L001_R1_001.fastq.gz" "190919-16_S16_L001_R2_001.fastq.gz"
[31] "190919-17_S17_L001_R1_001.fastq.gz" "190919-17_S17_L001_R2_001.fastq.gz"
[33] "190919-18_S18_L001_R1_001.fastq.gz" "190919-18_S18_L001_R2_001.fastq.gz"
[35] "190919-19_S19_L001_R1_001.fastq.gz" "190919-19_S19_L001_R2_001.fastq.gz"
[37] "190919-20_S20_L001_R1_001.fastq.gz" "190919-20_S20_L001_R2_001.fastq.gz"
[39] "190919-21_S21_L001_R1_001.fastq.gz" "190919-21_S21_L001_R2_001.fastq.gz"
[41] "190919-22_S22_L001_R1_001.fastq.gz" "190919-22_S22_L001_R2_001.fastq.gz"
[43] "190919-23_S23_L001_R1_001.fastq.gz" "190919-23_S23_L001_R2_001.fastq.gz"
[45] "190919-24_S24_L001_R1_001.fastq.gz" "190919-24_S24_L001_R2_001.fastq.gz"
[47] "190919-25_S25_L001_R1_001.fastq.gz" "190919-25_S25_L001_R2_001.fastq.gz"
[49] "190919-26_S26_L001_R1_001.fastq.gz" "190919-26_S26_L001_R2_001.fastq.gz"
[51] "190919-27_S27_L001_R1_001.fastq.gz" "190919-27_S27_L001_R2_001.fastq.gz"
[53] "190919-28_S28_L001_R1_001.fastq.gz" "190919-28_S28_L001_R2_001.fastq.gz"
[55] "190919-29_S29_L001_R1_001.fastq.gz" "190919-29_S29_L001_R2_001.fastq.gz"
[57] "190919-30_S30_L001_R1_001.fastq.gz" "190919-30_S30_L001_R2_001.fastq.gz"
[59] "190919-31_S31_L001_R1_001.fastq.gz" "190919-31_S31_L001_R2_001.fastq.gz"
[61] "190919-32_S32_L001_R1_001.fastq.gz" "190919-32_S32_L001_R2_001.fastq.gz"
[63] "190919-33_S33_L001_R1_001.fastq.gz" "190919-33_S33_L001_R2_001.fastq.gz"
[65] "190919-34_S34_L001_R1_001.fastq.gz" "190919-34_S34_L001_R2_001.fastq.gz"
[67] "190919-35_S35_L001_R1_001.fastq.gz" "190919-35_S35_L001_R2_001.fastq.gz"
[69] "190919-36_S36_L001_R1_001.fastq.gz" "190919-36_S36_L001_R2_001.fastq.gz"
[71] "190919-37_S37_L001_R1_001.fastq.gz" "190919-37_S37_L001_R2_001.fastq.gz"
[73] "190919-38_S38_L001_R1_001.fastq.gz" "190919-38_S38_L001_R2_001.fastq.gz"
[75] "190919-39_S39_L001_R1_001.fastq.gz" "190919-39_S39_L001_R2_001.fastq.gz"
[77] "190919-40_S40_L001_R1_001.fastq.gz" "190919-40_S40_L001_R2_001.fastq.gz"
[79] "190919-41_S41_L001_R1_001.fastq.gz" "190919-41_S41_L001_R2_001.fastq.gz"
[81] "190919-42_S42_L001_R1_001.fastq.gz" "190919-42_S42_L001_R2_001.fastq.gz"
[83] "190919-43_S43_L001_R1_001.fastq.gz" "190919-43_S43_L001_R2_001.fastq.gz"
[85] "190919-44_S44_L001_R1_001.fastq.gz" "190919-44_S44_L001_R2_001.fastq.gz"
[87] "190919-45_S45_L001_R1_001.fastq.gz" "190919-45_S45_L001_R2_001.fastq.gz"
[89] "190919-46_S46_L001_R1_001.fastq.gz" "190919-46_S46_L001_R2_001.fastq.gz"
[91] "190919-47_S47_L001_R1_001.fastq.gz" "190919-47_S47_L001_R2_001.fastq.gz"
[93] "190919-48_S48_L001_R1_001.fastq.gz" "190919-48_S48_L001_R2_001.fastq.gz"
[95] "190919-49_S49_L001_R1_001.fastq.gz" "190919-49_S49_L001_R2_001.fastq.gz"
[97] "190919-50_S50_L001_R1_001.fastq.gz" "190919-50_S50_L001_R2_001.fastq.gz"
[99] "190919-51_S51_L001_R1_001.fastq.gz" "190919-51_S51_L001_R2_001.fastq.gz"
[101] "190919-52_S52_L001_R1_001.fastq.gz" "190919-52_S52_L001_R2_001.fastq.gz"
[103] "190919-53_S53_L001_R1_001.fastq.gz" "190919-53_S53_L001_R2_001.fastq.gz"
[105] "190919-54_S54_L001_R1_001.fastq.gz" "190919-54_S54_L001_R2_001.fastq.gz"
[107] "190919-55_S55_L001_R1_001.fastq.gz" "190919-55_S55_L001_R2_001.fastq.gz"
[109] "190919-56_S56_L001_R1_001.fastq.gz" "190919-56_S56_L001_R2_001.fastq.gz"
[111] "190919-57_S57_L001_R1_001.fastq.gz" "190919-57_S57_L001_R2_001.fastq.gz"
[113] "190919-58_S58_L001_R1_001.fastq.gz" "190919-58_S58_L001_R2_001.fastq.gz"
[115] "190919-59_S59_L001_R1_001.fastq.gz" "190919-59_S59_L001_R2_001.fastq.gz"
[117] "190919-60_S60_L001_R1_001.fastq.gz" "190919-60_S60_L001_R2_001.fastq.gz"
[119] "190919-61_S61_L001_R1_001.fastq.gz" "190919-61_S61_L001_R2_001.fastq.gz"
[121] "190919-62_S62_L001_R1_001.fastq.gz" "190919-62_S62_L001_R2_001.fastq.gz"
[123] "190919-63_S63_L001_R1_001.fastq.gz" "190919-63_S63_L001_R2_001.fastq.gz"
[125] "190919-64_S64_L001_R1_001.fastq.gz" "190919-64_S64_L001_R2_001.fastq.gz"
[127] "190919-65_S65_L001_R1_001.fastq.gz" "190919-65_S65_L001_R2_001.fastq.gz"
[129] "190919-66_S66_L001_R1_001.fastq.gz" "190919-66_S66_L001_R2_001.fastq.gz"
[131] "190919-67_S67_L001_R1_001.fastq.gz" "190919-67_S67_L001_R2_001.fastq.gz"
[133] "190919-68_S68_L001_R1_001.fastq.gz" "190919-68_S68_L001_R2_001.fastq.gz"
[135] "190919-69_S69_L001_R1_001.fastq.gz" "190919-69_S69_L001_R2_001.fastq.gz"
[137] "190919-70_S70_L001_R1_001.fastq.gz" "190919-70_S70_L001_R2_001.fastq.gz"
[139] "190919-71_S71_L001_R1_001.fastq.gz" "190919-71_S71_L001_R2_001.fastq.gz"
[141] "190919-72_S72_L001_R1_001.fastq.gz" "190919-72_S72_L001_R2_001.fastq.gz"
In gray-scale is a heat map of the frequency of each quality score at each base position. The median quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least that position (this is more useful for other sequencing technologies, as Illumina reads are typically all the same length, hence the flat red line).
Source: DADA2 Pipeline Tutorial (1.12) NOTE: the reason the quality seems to be low at the beginning is that the program is using moving averages, so there are fewer data points in the beginning. No trimming is needed on the left.
user system elapsed
379.382 22.293 402.559
user system elapsed
394.521 30.456 425.209
The reads were trimmed approximately to the length at which the quality score median (the green line) went below 20.
The forward reads were of a very good quality. Only the last 10 bases were trimmed.
The reverse reads were of lower quality and were trimmed at the length of 220 bases.
# Derive sample IDs from the forward-read file names: strip the FastQ
# directory prefix, then keep everything before the first underscore
# (e.g. "190919-01_S1_L001_R1_001.fastq.gz" -> "190919-01").
sample.names <- gsub(x = fnFs,
                     pattern = "fastq_sep2019/",
                     replacement = "")
# vapply() (not sapply()) so the result is guaranteed to be a character
# vector even on empty/degenerate input.
sample.names <- vapply(strsplit(sample.names, "_"),
                       function(parts) parts[[1]],
                       character(1))
sample.names
# Build matching output paths for the quality-filtered reads by swapping the
# raw FastQ directory for the "filtered_sep2019" directory, forward and
# reverse alike.
filtFs <- gsub(pattern = "fastq_sep2019/",
               replacement = "filtered_sep2019/",
               x = fnFs)
filtRs <- gsub(pattern = "fastq_sep2019/",
               replacement = "filtered_sep2019/",
               x = fnRs)
# Quality-filter and trim the paired reads (dada2::filterAndTrim), timed.
# Forward reads are truncated at 280 bases, reverse at 220 (where the median
# quality dropped below ~Q20 per the profiles above).
# NOTE(review): the hard-coded [1:71] index assumes exactly 71 sample pairs
# (sample 14 is absent from this run's file listing) -- confirm if reused.
system.time({
out <- filterAndTrim(fwd = fnFs[1:71],
filt = filtFs[1:71],
rev = fnRs[1:71],
filt.rev = filtRs[1:71],
truncLen = c(280,220),
# trimRight = c(20, 80),
maxN = 0, # discard reads containing any ambiguous (N) bases
maxEE = c(2,2), # max expected errors allowed per forward/reverse read
truncQ = 2, # truncate reads at the first base with quality <= 2
rm.phix = TRUE, # drop PhiX spike-in reads
compress = TRUE,
multithread = mt)
})
# Cache the per-sample reads.in/reads.out summary for the tracking table.
save(out,
file = "data_sep2019/out.RData")
gc()
NOTE: parameter learning is computationally intensive, so by default the learnErrors function uses only a subset of the data (the first 1M reads). If the plotted error model does not look like a good fit, try increasing the nreads parameter to see if the fit improves.
Warning in .Internal(gc(verbose, reset, full)) :
closing unused connection 3 (filtered_sep2019/190919-02_S2_L001_R1_001.fastq.gz)
118609120 total bases in 423604 reads from 2 samples will be used for learning the error rates.
user system elapsed
14329.021 243.752 937.785
134966700 total bases in 613485 reads from 3 samples will be used for learning the error rates.
user system elapsed
12752.364 542.757 1386.940
NOTE: for larger datasets (exceeding available RAM) process samples one-by-one. See DADA2 Workflow on Big Data.
user system elapsed
194.490 34.772 229.789
user system elapsed
168.729 37.539 212.623
$`190919-01_S1_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 208682 reads in 78986 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 78986 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 20187 78125 58540 18275 4 ...
$`190919-02_S2_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 214922 reads in 81933 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 81933 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 1150 47016 158 64172 42594 ...
$`190919-03_S3_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 189881 reads in 70538 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 70538 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 6 33002 13125 1272 208 ...
$`190919-04_S4_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 155288 reads in 63486 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 63486 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 975 22567 8179 138 39777 ...
$`190919-05_S5_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 156011 reads in 57772 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 57772 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 7732 33540 37458 7007 1983 ...
$`190919-06_S6_L001_R1_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 97673 reads in 39615 unique sequences
Sequence lengths: min=280, median=280, max=280
$quals: Quality matrix dimension: 39615 280
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 728 36289 2 13607 94 ...
$`190919-01_S1_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 208682 reads in 103497 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 103497 220
Consensus quality scores: min=7, median=37, max=38
$map: Map from reads to unique sequences: 136 46624 83421 59235 1669 ...
$`190919-02_S2_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 214922 reads in 90896 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 90896 220
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 5 259 99 90823 22534 ...
$`190919-03_S3_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 189881 reads in 86196 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 86196 220
Consensus quality scores: min=7, median=37.66667, max=38
$map: Map from reads to unique sequences: 8 80357 855 33627 54441 ...
$`190919-04_S4_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 155288 reads in 66115 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 66115 220
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 55 15922 79 24474 589 ...
$`190919-05_S5_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 156011 reads in 63256 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 63256 220
Consensus quality scores: min=7, median=38, max=38
$map: Map from reads to unique sequences: 490 27673 25016 39985 5347 ...
$`190919-06_S6_L001_R2_001.fastq.gz`
derep-class: R object describing dereplicated sequencing reads
$uniques: 97673 reads in 53393 unique sequences
Sequence lengths: min=220, median=220, max=220
$quals: Quality matrix dimension: 53393 220
Consensus quality scores: min=7, median=37, max=38
$map: Map from reads to unique sequences: 1591 49138 16 52886 314 ...
Notes from IGS Workshop*:
Sample Inference - inferring the sequence variants in each sample.
By default, the dada function processes each sample independently, but pooled processing is available with pool=TRUE and that may give better results for low sampling depths at the cost of increased computation time.
All samples are simultaneously loaded into memory by default. If the datasets approach or exceed available RAM, it is preferable to process samples one-by-one in a streaming fashion: see DADA2 Workflow on Big Data for an example.
Sample 1 - 208682 reads in 78986 unique sequences.
Sample 2 - 214922 reads in 81933 unique sequences.
Sample 3 - 189881 reads in 70538 unique sequences.
Sample 4 - 155288 reads in 63486 unique sequences.
Sample 5 - 156011 reads in 57772 unique sequences.
Sample 6 - 97673 reads in 39615 unique sequences.
Sample 7 - 214478 reads in 75258 unique sequences.
Sample 8 - 214288 reads in 72843 unique sequences.
Sample 9 - 191334 reads in 61862 unique sequences.
Sample 10 - 53368 reads in 23463 unique sequences.
Sample 11 - 218322 reads in 81671 unique sequences.
Sample 12 - 226471 reads in 79718 unique sequences.
Sample 13 - 168460 reads in 55973 unique sequences.
Sample 14 - 155973 reads in 43222 unique sequences.
Sample 15 - 136567 reads in 48486 unique sequences.
Sample 16 - 208622 reads in 59908 unique sequences.
Sample 17 - 182915 reads in 57873 unique sequences.
Sample 18 - 163113 reads in 51279 unique sequences.
Sample 19 - 171191 reads in 50145 unique sequences.
Sample 20 - 164488 reads in 59505 unique sequences.
Sample 21 - 185688 reads in 56813 unique sequences.
Sample 22 - 158746 reads in 52244 unique sequences.
Sample 23 - 206551 reads in 73905 unique sequences.
Sample 24 - 213950 reads in 79507 unique sequences.
Sample 25 - 177075 reads in 63359 unique sequences.
Sample 26 - 171497 reads in 55533 unique sequences.
Sample 27 - 143271 reads in 59668 unique sequences.
Sample 28 - 231392 reads in 79508 unique sequences.
Sample 29 - 49208 reads in 20277 unique sequences.
Sample 30 - 224062 reads in 81241 unique sequences.
Sample 31 - 187071 reads in 67570 unique sequences.
Sample 32 - 207234 reads in 70740 unique sequences.
Sample 33 - 241967 reads in 84555 unique sequences.
Sample 34 - 209402 reads in 74361 unique sequences.
Sample 35 - 236361 reads in 87074 unique sequences.
Sample 36 - 188008 reads in 63184 unique sequences.
Sample 37 - 158415 reads in 47595 unique sequences.
Sample 38 - 161082 reads in 55731 unique sequences.
Sample 39 - 147077 reads in 45031 unique sequences.
Sample 40 - 191261 reads in 65684 unique sequences.
Sample 41 - 147237 reads in 52735 unique sequences.
Sample 42 - 159965 reads in 48532 unique sequences.
Sample 43 - 137943 reads in 48604 unique sequences.
Sample 44 - 132901 reads in 45884 unique sequences.
Sample 45 - 207400 reads in 70536 unique sequences.
Sample 46 - 160149 reads in 61436 unique sequences.
Sample 47 - 259652 reads in 93727 unique sequences.
Sample 48 - 214571 reads in 76197 unique sequences.
Sample 49 - 230455 reads in 78839 unique sequences.
Sample 50 - 150264 reads in 50275 unique sequences.
Sample 51 - 226126 reads in 77727 unique sequences.
Sample 52 - 208034 reads in 72041 unique sequences.
Sample 53 - 183836 reads in 62762 unique sequences.
Sample 54 - 224006 reads in 71252 unique sequences.
Sample 55 - 133868 reads in 49056 unique sequences.
Sample 56 - 143077 reads in 52425 unique sequences.
Sample 57 - 129351 reads in 41721 unique sequences.
Sample 58 - 283741 reads in 89756 unique sequences.
Sample 59 - 190064 reads in 58799 unique sequences.
Sample 60 - 266255 reads in 87105 unique sequences.
Sample 61 - 179351 reads in 50073 unique sequences.
Sample 62 - 234205 reads in 67939 unique sequences.
Sample 63 - 177428 reads in 60817 unique sequences.
Sample 64 - 272638 reads in 83969 unique sequences.
Sample 65 - 199273 reads in 67858 unique sequences.
Sample 66 - 190721 reads in 66355 unique sequences.
Sample 67 - 169510 reads in 57976 unique sequences.
Sample 68 - 162187 reads in 59392 unique sequences.
Sample 69 - 199451 reads in 57811 unique sequences.
Sample 70 - 139848 reads in 52531 unique sequences.
Sample 71 - 166140 reads in 55056 unique sequences.
user system elapsed
42860.672 673.845 2845.200
Sample 1 - 208682 reads in 103497 unique sequences.
Sample 2 - 214922 reads in 90896 unique sequences.
Sample 3 - 189881 reads in 86196 unique sequences.
Sample 4 - 155288 reads in 66115 unique sequences.
Sample 5 - 156011 reads in 63256 unique sequences.
Sample 6 - 97673 reads in 53393 unique sequences.
Sample 7 - 214478 reads in 79778 unique sequences.
Sample 8 - 214288 reads in 85795 unique sequences.
Sample 9 - 191334 reads in 71023 unique sequences.
Sample 10 - 53368 reads in 25345 unique sequences.
Sample 11 - 218322 reads in 92721 unique sequences.
Sample 12 - 226471 reads in 89153 unique sequences.
Sample 13 - 168460 reads in 62088 unique sequences.
Sample 14 - 155973 reads in 59139 unique sequences.
Sample 15 - 136567 reads in 53949 unique sequences.
Sample 16 - 208622 reads in 71042 unique sequences.
Sample 17 - 182915 reads in 70159 unique sequences.
Sample 18 - 163113 reads in 56386 unique sequences.
Sample 19 - 171191 reads in 61120 unique sequences.
Sample 20 - 164488 reads in 65183 unique sequences.
Sample 21 - 185688 reads in 67193 unique sequences.
Sample 22 - 158746 reads in 60145 unique sequences.
Sample 23 - 206551 reads in 82695 unique sequences.
Sample 24 - 213950 reads in 93871 unique sequences.
Sample 25 - 177075 reads in 80371 unique sequences.
Sample 26 - 171497 reads in 78211 unique sequences.
Sample 27 - 143271 reads in 63112 unique sequences.
Sample 28 - 231392 reads in 106092 unique sequences.
Sample 29 - 49208 reads in 24183 unique sequences.
Sample 30 - 224062 reads in 91240 unique sequences.
Sample 31 - 187071 reads in 83433 unique sequences.
Sample 32 - 207234 reads in 86049 unique sequences.
Sample 33 - 241967 reads in 105897 unique sequences.
Sample 34 - 209402 reads in 95474 unique sequences.
Sample 35 - 236361 reads in 105002 unique sequences.
Sample 36 - 188008 reads in 80720 unique sequences.
Sample 37 - 158415 reads in 63854 unique sequences.
Sample 38 - 161082 reads in 76356 unique sequences.
Sample 39 - 147077 reads in 58928 unique sequences.
Sample 40 - 191261 reads in 82852 unique sequences.
Sample 41 - 147237 reads in 69891 unique sequences.
Sample 42 - 159965 reads in 62194 unique sequences.
Sample 43 - 137943 reads in 64172 unique sequences.
Sample 44 - 132901 reads in 59242 unique sequences.
Sample 45 - 207400 reads in 91543 unique sequences.
Sample 46 - 160149 reads in 75375 unique sequences.
Sample 47 - 259652 reads in 116138 unique sequences.
Sample 48 - 214571 reads in 91073 unique sequences.
Sample 49 - 230455 reads in 94375 unique sequences.
Sample 50 - 150264 reads in 63699 unique sequences.
Sample 51 - 226126 reads in 89190 unique sequences.
Sample 52 - 208034 reads in 85216 unique sequences.
Sample 53 - 183836 reads in 80119 unique sequences.
Sample 54 - 224006 reads in 81950 unique sequences.
Sample 55 - 133868 reads in 59602 unique sequences.
Sample 56 - 143077 reads in 61456 unique sequences.
Sample 57 - 129351 reads in 53374 unique sequences.
Sample 58 - 283741 reads in 112043 unique sequences.
Sample 59 - 190064 reads in 76593 unique sequences.
Sample 60 - 266255 reads in 105528 unique sequences.
Sample 61 - 179351 reads in 67605 unique sequences.
Sample 62 - 234205 reads in 94784 unique sequences.
Sample 63 - 177428 reads in 70163 unique sequences.
Sample 64 - 272638 reads in 100917 unique sequences.
Sample 65 - 199273 reads in 84800 unique sequences.
Sample 66 - 190721 reads in 73108 unique sequences.
Sample 67 - 169510 reads in 71255 unique sequences.
Sample 68 - 162187 reads in 69693 unique sequences.
Sample 69 - 199451 reads in 75430 unique sequences.
Sample 70 - 139848 reads in 60610 unique sequences.
Sample 71 - 166140 reads in 64584 unique sequences.
user system elapsed
26880.854 958.056 2893.947
user system elapsed
401.827 16.374 418.130
user system elapsed
0.683 0.489 1.181
[1] 71 41923
user system elapsed
13226.705 11.186 444.675
[1] 71 8097
[1] "Chimeras = 60.2%"
NOTE: According to the IGS, de novo chimeras are determined based on the most abundant sequences in a given dataset. Usually 5-7% of sequences are chimeras. It is much higher in this dataset (60.2%). IGS recommends revisiting the removal of primers, as the ambiguous nucleotides in unremoved primers interfere with chimera identification.
# Restore the filterAndTrim() read-count summary saved earlier in the run.
load("data_sep2019/out.RData")
# Total number of reads represented by a dada2 object: the sum of the
# abundances of its unique sequences.
getN <- function(x) sum(getUniques(x))
# Track how many reads survive each pipeline stage, one row per sample:
# raw/filtered counts come from `out`; denoised and merged counts are summed
# per sample; the last two columns count reads in the sequence table before
# and after chimera removal. vapply() (not sapply()) guarantees a numeric
# vector per column.
track <- cbind(out,
               vapply(dadaFs, getN, numeric(1)),
               vapply(mergers, getN, numeric(1)),
               rowSums(seqtab),
               rowSums(seqtab.nochim))
colnames(track) <- c("Raw",
                     "Filtered",
                     "Denoised",
                     "Merged",
                     "Tabled",
                     "Non-Chimeras")
rownames(track) <- sample.names
# Interactive summary table; show all samples on one page.
DT::datatable(format(track,
                     big.mark = ","),
              options = list(pageLength = nrow(track)))
IGS suggests the number of merged sequences can potentially be increased by truncating the reads less (truncLen parameter in the filterAndTrim function), specifically, making sure that the truncated reads span the amplicon. This might not be the case here as the remaining reads are relatively long (280 bases for forward and 220 bases for reverse reads).
Write out and save your results thus far:
# Build the FASTA lines for the ASVs: a ">Seq_i" header followed by its
# sequence, for every column of the sequence table.
fc <- file("data_sep2019/all_runs_dada2_ASV.fasta")
# Interleave headers and sequences in one vectorized step: rbind() stacks the
# two length-n character vectors into a 2 x n matrix and as.vector() reads it
# column-wise (header1, seq1, header2, seq2, ...). This replaces the original
# append()-in-a-loop construction, which grows `fltp` on every iteration
# (O(n^2)), and uses seq_len() instead of 1:ncol(seqtab), which misbehaves
# when ncol(seqtab) is 0.
fltp <- as.vector(rbind(paste0(">Seq_", seq_len(ncol(seqtab))),
                        colnames(seqtab)))
Warning in paste0(">Seq_", i) :
closing unused connection 3 (data_may2019/all_runs_dada2_ASV.fasta)
# Write the interleaved header/sequence lines to the FASTA file and release
# the connection.
writeLines(fltp,
fc)
close(fc)
# Peek at the first few FASTA lines to sanity-check the output.
head(fltp)
[1] ">Seq_1"
[2] "GTGTCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCTGTTTCGTAAGTCGTGTGTGAAAGGCGCGGGCTCAACCCGCGGACGGCACATGATACTGCGAGACTAGAGTAATGGAGGGGGAACCGGAATTCTCGGTGTAGCAGTGAAATGCGTAGATATCGAGAGGAACACTCGTGGCGAAGGCGGGTTCCTGGACATTAACTGACGCTGAGGCACGAAGGCCAGGGGAGCGAAAGGGATTAGAAACCCCCGTAGTCC"
[3] ">Seq_2"
[4] "GTGTCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCTGTTTCGTAAGTCGTGTGTGAAAGGCGCGGGCTCAACCCGCGGACGGCACATGATACTGCGAGACTAGAGTAATGGAGGGGGAACCGGAATTCTCGGTGTAGCAGTGAAATGCGTAGATATCGAGAGGAACACTCGTGGCGAAGGCGGGTTCCTGGACATTAACTGACGCTGAGGCACGAAGGCCAGGGGAGCGAAAGGGATTAGATACCCCCGTAGTCC"
[5] ">Seq_3"
[6] "GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCTGTTTCGTAAGTCGTGTGTGAAAGGCGCGGGCTCAACCCGCGGACGGCACATGATACTGCGAGACTAGAGTAATGGAGGGGGAACCGGAATTCTCGGTGTAGCAGTGAAATGCGTAGATATCGAGAGGAACACTCGTGGCGAAGGCGGGTTCCTGGACATTAACTGACGCTGAGGCACGAAGGCCAGGGGAGCGAAAGGGATTAGAAACCCCCGTAGTCC"
# Free the (large) FASTA line vector and report memory use.
rm(fltp)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 13465694 719.2 20238321 1080.9 20238321 1080.9
Vcells 2758006606 21042.0 3943654098 30087.7 3943650900 30087.7
NOTE: create taxa.RData once, then comment that code out and load the R data file when rerunning the script.
# Taxonomy assignment against the SILVA v132 training set. assignTaxonomy()
# is slow, so it is run once (commented out below), cached with save(), and
# the cached .RData is reloaded on reruns.
# taxa <- assignTaxonomy(seqs = seqtab.nochim,
# refFasta = "tax/silva_nr_v132_train_set.fa",
# multithread = mt)
# save(taxa,
# file = "data_sep2019/taxa.RData")
load("data_sep2019/taxa.RData")
print(paste("Number of unique references =",
format(nrow(taxa),
big.mark = ",")))
# Preview the first few taxonomy assignments.
datatable(taxa[1:5, ],
rownames = FALSE)
# Keep only the references found in the data
taxa.tmp <- taxa[rownames(taxa) %in% colnames(seqtab.nochim), ]
print(paste("Number of references matched in the data =",
format(nrow(taxa.tmp),
big.mark = ",")))
# # Add species (do it once)
# taxa.plus <- addSpecies(taxtab = taxa.tmp,
# refFasta = "tax/silva_species_assignment_v132.fa",
# verbose = TRUE)
# save(taxa.plus,
# file = "data_sep2019/taxa.plus.RData")
#
# load("data_sep2019/taxa.plus.RData")
# NOTE(review): the commented-out save() originally pointed at
# "data_may2019/", a leftover from the May run, while the load() reads from
# "data_sep2019/"; corrected above so save and load use the same folder.
# Build the phyloseq OTU table from the chimera-free sequence table;
# samples are rows in seqtab.nochim, hence taxa_are_rows = FALSE.
dt.otu <- otu_table(seqtab.nochim,
taxa_are_rows = FALSE)
# Relabel the rows with the short sample IDs derived from the file names.
sample_names(dt.otu) <- sample.names
print("Sample names in OTU table")
[1] "Sample names in OTU table"
sample_names(dt.otu)
[1] "190919-01" "190919-02" "190919-03" "190919-04" "190919-05" "190919-06"
[7] "190919-07" "190919-08" "190919-09" "190919-10" "190919-11" "190919-12"
[13] "190919-13" "190919-15" "190919-16" "190919-17" "190919-18" "190919-19"
[19] "190919-20" "190919-21" "190919-22" "190919-23" "190919-24" "190919-25"
[25] "190919-26" "190919-27" "190919-28" "190919-29" "190919-30" "190919-31"
[31] "190919-32" "190919-33" "190919-34" "190919-35" "190919-36" "190919-37"
[37] "190919-38" "190919-39" "190919-40" "190919-41" "190919-42" "190919-43"
[43] "190919-44" "190919-45" "190919-46" "190919-47" "190919-48" "190919-49"
[49] "190919-50" "190919-51" "190919-52" "190919-53" "190919-54" "190919-55"
[55] "190919-56" "190919-57" "190919-58" "190919-59" "190919-60" "190919-61"
[61] "190919-62" "190919-63" "190919-64" "190919-65" "190919-66" "190919-67"
[67] "190919-68" "190919-69" "190919-70" "190919-71" "190919-72"
# Wrap the meta-data as a phyloseq sample_data object and key it by sample
# name. NOTE(review): the printout below shows row names "1".."72", so this
# rownames<- assignment does not appear to take effect -- verify the
# SAMPLE_NAME column and the slot assignment further down.
metadata <- sample_data(dt.meta)
rownames(metadata) <- metadata$SAMPLE_NAME
print("Sample names in metadata")
[1] "Sample names in metadata"
sample_names(metadata)
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
[16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
[31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
[46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
[61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
# NOTE(review): this pokes the @row.names slot directly, bypassing the
# sample_names<- accessor and its validity checks -- presumably because
# dt.meta has 72 rows while the OTU table holds only 71 samples (sample 14 is
# absent from this run), which the accessor would reject. Confirm the extra
# meta-data row is dropped intentionally and that rows stay in sample order.
metadata@row.names <- sample_names(dt.otu)
# Assemble the experiment-level phyloseq object: OTU table + sample
# meta-data + taxonomy table.
ps_sep2019 <- phyloseq(dt.otu,
metadata,
tax_table(taxa))
sample_names(ps_sep2019)
[1] "190919-01" "190919-02" "190919-03" "190919-04" "190919-05" "190919-06"
[7] "190919-07" "190919-08" "190919-09" "190919-10" "190919-11" "190919-12"
[13] "190919-13" "190919-15" "190919-16" "190919-17" "190919-18" "190919-19"
[19] "190919-20" "190919-21" "190919-22" "190919-23" "190919-24" "190919-25"
[25] "190919-26" "190919-27" "190919-28" "190919-29" "190919-30" "190919-31"
[31] "190919-32" "190919-33" "190919-34" "190919-35" "190919-36" "190919-37"
[37] "190919-38" "190919-39" "190919-40" "190919-41" "190919-42" "190919-43"
[43] "190919-44" "190919-45" "190919-46" "190919-47" "190919-48" "190919-49"
[49] "190919-50" "190919-51" "190919-52" "190919-53" "190919-54" "190919-55"
[55] "190919-56" "190919-57" "190919-58" "190919-59" "190919-60" "190919-61"
[61] "190919-62" "190919-63" "190919-64" "190919-65" "190919-66" "190919-67"
[67] "190919-68" "190919-69" "190919-70" "190919-71" "190919-72"
# Persist the assembled phyloseq object for downstream analyses.
save(ps_sep2019,
file = "data_sep2019/ps_sep2019.RData")
# Record package versions for reproducibility.
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server 7.5 (Maipo)
Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] DT_0.6 stringr_1.4.0 ggplot2_3.2.0 phyloseq_1.26.1
[5] dada2_1.10.1 Rcpp_1.0.1 data.table_1.12.2 kableExtra_1.1.0
[9] knitr_1.23
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] matrixStats_0.54.0 webshot_0.5.1
[5] RColorBrewer_1.1-2 httr_1.4.0
[7] GenomeInfoDb_1.18.2 tools_3.5.0
[9] R6_2.4.0 vegan_2.5-5
[11] lazyeval_0.2.2 BiocGenerics_0.28.0
[13] mgcv_1.8-23 colorspace_1.4-1
[15] permute_0.9-5 ade4_1.7-13
[17] withr_2.1.2 tidyselect_0.2.5
[19] compiler_3.5.0 rvest_0.3.4
[21] Biobase_2.42.0 xml2_1.2.0
[23] DelayedArray_0.8.0 labeling_0.3
[25] scales_1.0.0 readr_1.3.1
[27] digest_0.6.19 Rsamtools_1.34.1
[29] rmarkdown_1.13 XVector_0.22.0
[31] base64enc_0.1-3 pkgconfig_2.0.2
[33] htmltools_0.3.6 htmlwidgets_1.3
[35] rlang_0.4.0 rstudioapi_0.10
[37] shiny_1.3.2 hwriter_1.3.2
[39] jsonlite_1.6 crosstalk_1.0.0
[41] BiocParallel_1.16.6 dplyr_0.8.1
[43] RCurl_1.95-4.12 magrittr_1.5
[45] GenomeInfoDbData_1.2.0 biomformat_1.10.1
[47] Matrix_1.2-14 munsell_0.5.0
[49] S4Vectors_0.20.1 Rhdf5lib_1.4.3
[51] ape_5.3 stringi_1.4.3
[53] yaml_2.2.0 MASS_7.3-49
[55] SummarizedExperiment_1.12.0 zlibbioc_1.28.0
[57] rhdf5_2.26.2 plyr_1.8.4
[59] grid_3.5.0 promises_1.0.1
[61] parallel_3.5.0 crayon_1.3.4
[63] lattice_0.20-35 Biostrings_2.50.2
[65] splines_3.5.0 multtest_2.38.0
[67] hms_0.4.2 pillar_1.4.0
[69] igraph_1.2.4.1 GenomicRanges_1.34.0
[71] reshape2_1.4.3 codetools_0.2-15
[73] stats4_3.5.0 glue_1.3.1
[75] evaluate_0.14 ShortRead_1.40.0
[77] latticeExtra_0.6-28 RcppParallel_4.4.2
[79] httpuv_1.5.1 foreach_1.4.4
[81] gtable_0.3.0 purrr_0.3.2
[83] assertthat_0.2.1 xfun_0.7
[85] mime_0.6 xtable_1.8-4
[87] later_0.8.0 survival_2.41-3
[89] viridisLite_0.3.0 tibble_2.1.3
[91] iterators_1.0.10 GenomicAlignments_1.18.1
[93] IRanges_2.16.0 cluster_2.0.7-1